import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import plotly.express as px
import plotly.io as pio
%matplotlib inline
pio.renderers.default = "plotly_mimetype+notebook"
df = pd.read_csv('gapminder_clean.csv')
df.describe
<bound method NDFrame.describe of Unnamed: 0 Country Name Year Agriculture, value added (% of GDP) \
0 0 Afghanistan 1962 NaN
1 1 Afghanistan 1967 NaN
2 2 Afghanistan 1972 NaN
3 3 Afghanistan 1977 NaN
4 4 Afghanistan 1982 NaN
... ... ... ... ...
2602 2602 Zimbabwe 1987 14.407528
2603 2603 Zimbabwe 1992 7.413793
2604 2604 Zimbabwe 1997 18.934082
2605 2605 Zimbabwe 2002 14.029007
2606 2606 Zimbabwe 2007 21.597907
CO2 emissions (metric tons per capita) \
0 0.073781
1 0.123782
2 0.130820
3 0.183118
4 0.165879
... ...
2602 1.598217
2603 1.533724
2604 1.194678
2605 0.942795
2606 0.743449
Domestic credit provided by financial sector (% of GDP) \
0 21.276422
1 9.917662
2 18.880833
3 13.836822
4 NaN
... ...
2602 74.161607
2603 43.120518
2604 63.058320
2605 164.559047
2606 NaN
Electric power consumption (kWh per capita) \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
2602 878.072691
2603 778.695133
2604 870.988697
2605 827.329873
2606 700.642317
Energy use (kg of oil equivalent per capita) \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
2602 896.673611
2603 923.493407
2604 804.508892
2605 772.676619
2606 725.681049
Exports of goods and services (% of GDP) \
0 4.878051
1 6.772908
2 14.763231
3 11.662904
4 NaN
... ...
2602 24.015710
2603 27.227263
2604 37.595273
2605 31.834799
2606 37.785373
Fertility rate, total (births per woman) GDP growth (annual %) \
0 7.450 NaN
1 7.450 NaN
2 7.450 NaN
3 7.449 NaN
4 7.450 NaN
... ... ...
2602 5.784 1.150737
2603 4.840 -9.015570
2604 4.237 2.680594
2605 4.018 -8.894023
2606 4.022 -3.653327
Imports of goods and services (% of GDP) \
0 9.349593
1 14.209827
2 18.105850
3 14.823175
4 NaN
... ...
2602 21.274886
2603 36.485231
2604 44.609791
2605 34.972553
2606 46.387531
Industry, value added (% of GDP) Inflation, GDP deflator (annual %) \
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 NaN NaN
... ... ...
2602 32.451242 7.189361
2603 40.862069 -14.129659
2604 25.554678 -2.879048
2605 NaN 2.712950
2606 33.074953 0.894887
Life expectancy at birth, total (years) \
0 33.219902
1 35.389415
2 37.610146
3 40.110146
4 43.230732
... ...
2602 61.753805
2603 56.491976
2604 46.065902
2605 40.679146
2606 44.177756
Population density (people per sq. km of land area) \
0 14.312061
1 15.881812
2 17.947027
3 19.998926
4 19.402324
... ...
2602 24.649495
2603 28.485762
2604 31.174507
2605 32.807111
2606 34.374559
Services, etc., value added (% of GDP) pop continent gdpPercap
0 NaN 10267083.0 Asia 853.100710
1 NaN 11537966.0 Asia 836.197138
2 NaN 13079460.0 Asia 739.981106
3 NaN 14880372.0 Asia 786.113360
4 NaN 12881816.0 Asia 978.011439
... ... ... ... ...
2602 53.141236 9216418.0 Africa 706.157306
2603 51.724135 10704340.0 Africa 693.420786
2604 55.511236 11404948.0 Africa 792.449960
2605 NaN 11926563.0 Africa 672.038623
2606 45.327139 12311143.0 Africa 469.709298
[2607 rows x 20 columns]>
df.head()
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Afghanistan | 1962 | NaN | 0.073781 | 21.276422 | NaN | NaN | 4.878051 | 7.450 | NaN | 9.349593 | NaN | NaN | 33.219902 | 14.312061 | NaN | 10267083.0 | Asia | 853.100710 |
| 1 | 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 2 | 2 | Afghanistan | 1972 | NaN | 0.130820 | 18.880833 | NaN | NaN | 14.763231 | 7.450 | NaN | 18.105850 | NaN | NaN | 37.610146 | 17.947027 | NaN | 13079460.0 | Asia | 739.981106 |
| 3 | 3 | Afghanistan | 1977 | NaN | 0.183118 | 13.836822 | NaN | NaN | 11.662904 | 7.449 | NaN | 14.823175 | NaN | NaN | 40.110146 | 19.998926 | NaN | 14880372.0 | Asia | 786.113360 |
| 4 | 4 | Afghanistan | 1982 | NaN | 0.165879 | NaN | NaN | NaN | NaN | 7.450 | NaN | NaN | NaN | NaN | 43.230732 | 19.402324 | NaN | 12881816.0 | Asia | 978.011439 |
#Filter the data to include only rows where Year is 1962.
df_1962 = df[df['Year'] == 1962]
#make a scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap for the filtered data.
plt.scatter(df_1962['CO2 emissions (metric tons per capita)'], df_1962['gdpPercap'])
plt.title('CO2 Emissions vs. GDP per capita (Year 1962)')
plt.xlabel('CO2 emissions (metric tons per capita)')
plt.ylabel('GDP per capita')
plt.show()
df_1962['gpd_log'] = np.log(df_1962['gdpPercap'])
df_1962['co2_log'] = np.log(df_1962['CO2 emissions (metric tons per capita)'])
fig = px.scatter(df_1962, x="co2_log", y="gpd_log", title='log(CO2 Emissions) vs. log(GDP per capita (Year 1962))')
fig.show()
/var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_56879/3866044842.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_56879/3866044842.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_1962.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'], inplace=True)
corr, p_value = stats.pearsonr(df_1962['CO2 emissions (metric tons per capita)'], df_1962['gdpPercap'])
print("Correlation:", corr)
print("P-value:", p_value)
Correlation: 0.9260816725019473 P-value: 1.1286792210036883e-46
/var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_56879/934864285.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_dropna = df.dropna(subset=['CO2 emissions (metric tons per capita)', 'gdpPercap'], inplace=False)
corr_by_year = df_dropna[['Year','CO2 emissions (metric tons per capita)','gdpPercap']].groupby('Year').corr(method='pearson')
corr_by_year.rename(index={0:'year', 1: 'va1', 2: 'co2', 3:'gpd'})
corr_by_year
| CO2 emissions (metric tons per capita) | gdpPercap | ||
|---|---|---|---|
| Year | |||
| 1962 | CO2 emissions (metric tons per capita) | 1.000000 | 0.926082 |
| gdpPercap | 0.926082 | 1.000000 | |
| 1967 | CO2 emissions (metric tons per capita) | 1.000000 | 0.938792 |
| gdpPercap | 0.938792 | 1.000000 | |
| 1972 | CO2 emissions (metric tons per capita) | 1.000000 | 0.842899 |
| gdpPercap | 0.842899 | 1.000000 | |
| 1977 | CO2 emissions (metric tons per capita) | 1.000000 | 0.792834 |
| gdpPercap | 0.792834 | 1.000000 | |
| 1982 | CO2 emissions (metric tons per capita) | 1.000000 | 0.816638 |
| gdpPercap | 0.816638 | 1.000000 | |
| 1987 | CO2 emissions (metric tons per capita) | 1.000000 | 0.809553 |
| gdpPercap | 0.809553 | 1.000000 | |
| 1992 | CO2 emissions (metric tons per capita) | 1.000000 | 0.809432 |
| gdpPercap | 0.809432 | 1.000000 | |
| 1997 | CO2 emissions (metric tons per capita) | 1.000000 | 0.808140 |
| gdpPercap | 0.808140 | 1.000000 | |
| 2002 | CO2 emissions (metric tons per capita) | 1.000000 | 0.800642 |
| gdpPercap | 0.800642 | 1.000000 | |
| 2007 | CO2 emissions (metric tons per capita) | 1.000000 | 0.720417 |
| gdpPercap | 0.720417 | 1.000000 |
strong_year = 1967
print(f'the year of the strongest correlation: {strong_year}')
the year of the strongest correlation: 1967
df_strongest_corr_year = df[df['Year'] == strong_year]
df_strongest_corr_year.head(5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 11 | 11 | Albania | 1967 | NaN | 1.363746 | NaN | NaN | NaN | NaN | 5.394 | NaN | NaN | NaN | NaN | 66.287220 | 71.737153 | NaN | 1984060.0 | Europe | 2760.196931 |
| 21 | 21 | Algeria | 1967 | 10.330666 | 0.632118 | 27.977088 | NaN | NaN | 23.434417 | 7.672 | 9.452963 | 21.631769 | 42.385889 | 1.312041 | 49.187512 | 5.606908 | 47.28345 | 12760499.0 | Africa | 3246.991771 |
| 31 | 31 | American Samoa | 1967 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 125.580000 | NaN | NaN | NaN | NaN |
| 41 | 41 | Andorra | 1967 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 44.159574 | NaN | NaN | NaN | NaN |
df_scatter = df_strongest_corr_year[~df_strongest_corr_year['pop'].isnull()]
df_scatter.head(5)
| Unnamed: 0 | Country Name | Year | Agriculture, value added (% of GDP) | CO2 emissions (metric tons per capita) | Domestic credit provided by financial sector (% of GDP) | Electric power consumption (kWh per capita) | Energy use (kg of oil equivalent per capita) | Exports of goods and services (% of GDP) | Fertility rate, total (births per woman) | GDP growth (annual %) | Imports of goods and services (% of GDP) | Industry, value added (% of GDP) | Inflation, GDP deflator (annual %) | Life expectancy at birth, total (years) | Population density (people per sq. km of land area) | Services, etc., value added (% of GDP) | pop | continent | gdpPercap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | Afghanistan | 1967 | NaN | 0.123782 | 9.917662 | NaN | NaN | 6.772908 | 7.450 | NaN | 14.209827 | NaN | NaN | 35.389415 | 15.881812 | NaN | 11537966.0 | Asia | 836.197138 |
| 11 | 11 | Albania | 1967 | NaN | 1.363746 | NaN | NaN | NaN | NaN | 5.394 | NaN | NaN | NaN | NaN | 66.287220 | 71.737153 | NaN | 1984060.0 | Europe | 2760.196931 |
| 21 | 21 | Algeria | 1967 | 10.330666 | 0.632118 | 27.977088 | NaN | NaN | 23.434417 | 7.672 | 9.452963 | 21.631769 | 42.385889 | 1.312041 | 49.187512 | 5.606908 | 47.283450 | 12760499.0 | Africa | 3246.991771 |
| 51 | 51 | Angola | 1967 | NaN | 0.166659 | NaN | NaN | NaN | NaN | 7.401 | NaN | NaN | NaN | NaN | 35.799976 | 4.782892 | NaN | 5247469.0 | Africa | 5522.776375 |
| 81 | 81 | Argentina | 1967 | 9.980386 | 2.858163 | 18.651542 | NaN | NaN | 7.495596 | 3.048 | 3.191997 | 4.997064 | 47.383414 | 29.018263 | 65.930024 | 8.379539 | 42.636201 | 22934225.0 | Americas | 8052.953021 |
continent = df_scatter['continent'].unique()
continent
array(['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'], dtype=object)
df_scatter['gpd_log'] = np.log(df_scatter['gdpPercap'])
df_scatter['co2_log'] = np.log(df_scatter['CO2 emissions (metric tons per capita)'])
fig = px.scatter(df_scatter, x="co2_log", y="gpd_log", size="pop", color="continent")
fig.show()
/var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_56879/1693796355.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/tx/lt873txs0695kyyx9kbh1gcc0000gn/T/ipykernel_56879/1693796355.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# boxplot
sns.boxplot(data=df,x='continent',y='Energy use (kg of oil equivalent per capita)')
plt.title('Energy use by continent')
plt.ylabel('Energy use (kg of oil equivalent per capita)')
plt.xlabel('continent')
# ANOVA test
df_1 = df[~df['continent'].isnull()]
df_1 = df_1[~df_1['Energy use (kg of oil equivalent per capita)'].isnull()]
df_1.head(5)
fval, pval = stats.f_oneway(df_1[df_1['continent'] == 'Africa']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Americas']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Asia']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Europe']['Energy use (kg of oil equivalent per capita)'],
df_1[df_1['continent'] == 'Oceania']['Energy use (kg of oil equivalent per capita)'])
print(f'ANOVA F-value: {fval}')
print(f'ANOVA p-value: {pval}')
ANOVA F-value: 51.45915759042634 ANOVA p-value: 8.527003487154367e-39
# line plots
df_Europe = df_1[(df_1['continent'] == 'Europe') & (df_1['Year'] > 1990) & ~df_1['Imports of goods and services (% of GDP)'].isnull()]
df_Asia = df_1[(df_1['continent'] == 'Asia') & (df_1['Year'] >1990) & ~df_1['Imports of goods and services (% of GDP)'].isnull()]
plt.plot(df_Europe['Year'],df_Europe['Imports of goods and services (% of GDP)'], label = 'Europe')
plt.plot(df_Asia['Year'],df_Asia['Imports of goods and services (% of GDP)'], label = 'Asia')
plt.title('Imports of goods and services (% of GDP) in Europe and Asia after 1990')
plt.xlabel('Year')
plt.ylabel('Imports of goods and services (% of GDP)')
plt.legend()
# t-test
tstat,pval = stats.ttest_ind(df_Europe['Imports of goods and services (% of GDP)'],
df_Asia['Imports of goods and services (% of GDP)'])
print(f'T-test statistics:{tstat}')
print(f'T-test p-value:{pval}')
T-test statistics:-1.0979607386870058 T-test p-value:0.2735222291887188
Question3: What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
Answer: To find the country (or countries) with the highest 'Population density (people per sq. km of land area)' across all years, we can create a bar chart to visualize the average ranking of population density for each country across all time points in the dataset. Monaco and Macao SAR, China have the highest 'Population density (people per sq. km of land area)' across all years.
# calculate highest 'Population density'
df_rank = df[~df['Population density (people per sq. km of land area)'].isnull()]
df_rank = df_rank[['Year', 'Country Name', 'Population density (people per sq. km of land area)']]
df_rank['rank'] = df_rank.groupby('Year')['Population density (people per sq. km of land area)'].rank()
df_rank_country = df_rank[['Country Name', 'rank']]
df_rank_country = df_rank_country.groupby('Country Name').mean().reset_index()
df_rank_country= df_rank_country.sort_values('rank')
df_rank_country.columns = ['country', 'rank']
df_rank_country.head()
fig = px.bar(data_frame=df_rank_country, x = 'country', y ='rank')
fig.show()
highest_density = df_rank_country['rank'].max()
print(f'highest rank: {highest_density}')
highest_countries = (df_rank_country.loc[df_rank_country['rank']==highest_density])['country'].values.tolist()
print(f'country (or countries) that has the highest density across all years: {highest_countries}')
highest rank: 255.3 country (or countries) that has the highest density across all years: ['Monaco', 'Macao SAR, China']
Question4: What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007?
Answer: To find the country (or countries) that has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007, we can calculate the difference in life expectancy between 2007 and 1962 for each country, and then sort the countries by this difference in descending order to identify the countries with the greatest increase. Maldives has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007.
#calculate difference in life expectancy between 2007 and 1962 for each country and sort the countries by this difference in descending order
df_lei = df[df['Year'].isin([2007, 1962])]
df_lei = df_lei[['Country Name', 'Year', 'Life expectancy at birth, total (years)']]
df_lei.columns = ['country', 'year', 'le']
df_lei.head()
| country | year | le | |
|---|---|---|---|
| 0 | Afghanistan | 1962 | 33.219902 |
| 9 | Afghanistan | 2007 | 57.833829 |
| 10 | Albania | 1962 | 64.162854 |
| 19 | Albania | 2007 | 76.470293 |
| 20 | Algeria | 1962 | 47.045000 |
df_lei_pivot = df_lei.pivot(index='country', columns='year', values='le')
df_lei_pivot.reset_index(inplace=True)
df_lei_pivot['lei'] = df_lei_pivot[2007] - df_lei_pivot[1962]
df_lei_pivot = df_lei_pivot[~df_lei_pivot['lei'].isnull()]
df_lei_pivot = df_lei_pivot.sort_values('lei', ascending=False)
highest_lei_increase = df_lei_pivot['lei'].max()
print(f'highest life expectancy increase: {highest_lei_increase}')
highest_lei_countries = (df_lei_pivot.loc[df_lei_pivot['lei']==highest_lei_increase])['country'].values.tolist()
print(f'country (or countries) has shown the greatest increase in Life expectancy at birth between 1962 and 2007: {highest_lei_countries}')
fig_lei = px.bar(df_lei_pivot, x = 'country', y ='lei')
pio.renderers.default = "plotly_mimetype+notebook"
fig_lei.show()
highest life expectancy increase: 36.916146341499996 country (or countries) has shown the greatest increase in Life expectancy at birth between 1962 and 2007: ['Maldives']
# export notebook to html
os.system('jupyter nbconvert --to html python_for_data_science.ipynb')
[NbConvertApp] Converting notebook python_for_data_science.ipynb to html [NbConvertApp] Writing 8044509 bytes to python_for_data_science.html
0